## This script assumes a Windows machine with 4 cores.  Parallelization in
#  R is operating system-specific, so if you attempt ot run this code with
#  a different setup, it will be substantially slower.

install.packages("stringr")
install.packages("dplyr")
install.packages("lfe")
install.packages("ggplot2")
install.packages("betareg")
install.packages("rstan", repos = c("https://mc-stan.org/r-packages/", getOption("repos")))

library(stringr)
library(dplyr)
library(lfe)
library(ggplot2)
library(betareg)
library(rstan)
options(mc.cores = parallel::detectCores())
rstan_options(auto_write = FALSE)

## Set this directory to wherever you have stored the
#  measurement_replication folder on your system
setwd("D://Dropbox//Congressworks//drafts//measurement paper//measurement_replication")

# Load comparisons data
data <- read.csv(".//scaling//comparisons.csv")

## Compile stan model for scaling.  Rtools must be installed on your computer
#  in order to compile stan models.  If your Rtools installation is not up to
#  date, this step will fail.
stan.model <- stan_model(".//scaling//base_montgomery_model.stan")

## This function estimates all of the key parameters for the analysis with
#  the pairwise comparison Bayesian model.  By manipulating data to include
#  only a subset of all comparisons (for example, only those coded by the 
#  experts or only those coded by the RAs), it is possible to examine the
#  sensitivity of the results to using different coders.
#  data: Comparisons data, where each row consists of a pairwise comparison
#        between two texts
#  stan.model: The compiled stan model above, from bse_montgomery_model.stan
get_alphas <- function(data, stan.model){
  # Each text has an ID that can be parsed to find the associated hearing
  # ID and the associated legislator
  ids <- unique(c(data$id1, data$id2))
  hearings <- sapply(str_split(ids, "-"), function(x) x[2])
  legislators <- sapply(str_split(ids, "-"), function(x) x[3])
  
  M <- nrow(data)
  N <- length(unique(ids))
  
  # Construct map from comparisons to speeches
  y_map <- matrix(0, M, 2)
  for (m in 1:M){
    y_map[m,] <- c(which(ids == data$id1[m]), which(ids == data$id2[m]))
  }
  
  # Build data for model
  input <- list()
  input[["M"]] <- M
  input[["N"]] <- N

  input[["info"]] <- data$informational
  input[["conf"]] <- data$confrontational
  input[["y_map"]] <- y_map
  
  # Run model
  set.seed(9122022)
  stan.fit <- sampling(stan.model, input, cores = 4, iter = 10000,
                       verbose = TRUE)
  
  # Save the relevant parts of the output to a list
  out <- list()
  # Convergence statistics
  out[["convergence"]] <- summary(summary(stan.fit)$summary[,10])
  # The posterior distribution of the alphas, which characterize information and confrontation
  out[["alpha.infoconf.posterior"]] <- alpha.infoconf.posterior <- extract(stan.fit, "alpha_infoconf")[[1]]
  # Summary statistics for the posterior distribution of the information-correlation coefficient
  out[["infoconf_corr_results"]] <-  c(mean(extract(stan.fit, "infoconf_corr")[[1]]),
                                       quantile(extract(stan.fit, "infoconf_corr")[[1]], .025),
                                       quantile(extract(stan.fit, "infoconf_corr")[[1]], .975),
                                       mean(extract(stan.fit, "infoconf_corr")[[1]] > 0))
  
  return(out)
}

## Get results for expert coders
experts_out <- get_alphas(data[which(data$expert == 1),], stan.model)
experts_out[["convergence"]]

## Plot Results for Figure 2
experts.posterior.means <- data.frame(apply(experts_out[["alpha.infoconf.posterior"]], c(2,3), mean))
colnames(experts.posterior.means) <- c("Informational", "Confrontational")
ggplot(data = experts.posterior.means, aes(x=Informational,y=Confrontational)) + geom_point()

## Get results for research assistant coders
ras_out <- get_alphas(data[which(data$ra == 1),], stan.model)
ras_out[["convergence"]]

## Get results for student coders
students_out <- get_alphas(data[which(data$student == 1),], stan.model)
students_out[["convergence"]]

## Results for Table 1
experts_out[["infoconf_corr_results"]]
ras_out[["infoconf_corr_results"]]
students_out[["infoconf_corr_results"]]

## Get results for specific expert-to-audit comparisons from the section
#  ``Information and Confrontation in Oversight Hearings''
ras.posterior.means <- data.frame(apply(ras_out[["alpha.infoconf.posterior"]], c(2,3), mean))
colnames(ras.posterior.means) <- c("Informational", "Confrontational")

## Waxman-Greenspan
expert.ids <- with(data[which(data$expert == 1),], unique(c(id1, id2)))
ra.ids <- with(data[which(data$ra == 1),], unique(c(id1, id2)))

experts.posterior.means[which(expert.ids == "CHRG-110hhrg55764-14280-2"),]
ras.posterior.means[which(ra.ids == "CHRG-110hhrg55764-14280-2"),]

## Issa-Allen
experts.posterior.means[which(expert.ids == "CHRG-110hhrg37263-20107-3"),]
ras.posterior.means[which(ra.ids == "CHRG-110hhrg37263-20107-3"),]

## Estimates the supplementary analyses, accounting for both uncertainty
#  about the correct value of the alphas as well as uncertainty in the
#  estimates of the coefficients
#  data: Comparisons data, where each row consists of a pairwise comparison
#        between two texts
#  background_df: Supplemental legislator-level variables
#  alpha_posterior: A set of posterior draws, as given by the output of get_alphas
getResults <- function(data, background_df, alpha_posterior){
  ## Each text has an ID that can be parsed to find the associated hearing
  #  ID and the associated legislator
  ids <- unique(c(data$id1, data$id2))
  hearings <- sapply(str_split(ids, "-"), function(x) x[2])
  legislators <- sapply(str_split(ids, "-"), function(x) x[3])
  
  ## Get number of distinct comparisons, texts, hearings, and legislators
  M <- nrow(data)
  N <- length(unique(ids))
  H <- length(unique(hearings))
  L <- length(unique(legislators))
  
  ## Merge legislator-level covariates
  covars <- background_df[match(background_df$id, ids),] %>% mutate(comm_leader = (Chair + Ranking + Subchair), Safe = as.numeric(Safe),
                                                                    PresParty = as.numeric(PresParty), Male = as.numeric(Gender == "M"),
                                                                    Republican = as.numeric(Minority)) %>%
    select(id, chamber, comm_leader, Safe, Extremity, PresParty, Republican, Male, Lawyer, Businessman, Career.Politician, Seniority)
  
  
  ## Construct map from speeches to hearings.  When we need to recall the order of the hearings, we can just call unique(hearings)
  h_map <- matrix(0, H, N)
  for (h in 1:H){
    h_map[h,] <- as.numeric(hearings == unique(hearings)[h])/sum(hearings == unique(hearings)[h])
  }
  
  ## Construct map from speeches to legislators
  l_map <- matrix(0, N, L)
  for (l in 1:L){
    l_map[,l] <- as.numeric(legislators == unique(legislators)[l])/sum(legislators == unique(legislators)[l])
  }
  
  ## Merge whether each hearing was a subcommittee hearing
  subcommittee <- left_join(x = data.frame("hearingid" = unique(hearings)), 
                            y = read.csv(".//scaling//is_subcomm_hearing.csv"),
                            by = c("hearingid" = "hearingid"))[,2]
  ## Merge whether each hearing was a Senate hearing
  senate <- sapply(unique(hearings), function(x) as.numeric(substr(x, 4, 4) == "s"))
  
  ## Build a legislator-level data frame
  df <- data.frame("senate" = as.numeric(covars$chamber == "senate"), 
                   "comm_leader" = covars$comm_leader,
                   "seniority" = covars$Seniority,
                   "safe" = covars$Safe,
                   "absnom" = covars$Extremity,
                   "pres_copar" = covars$PresParty,
                   "minority" = covars$Republican,
                   "male" = covars$Male,
                   "lawyer" = covars$Lawyer,
                   "businessman" = covars$Businessman,
                   "politician" = covars$Career.Politician,
                   "legid" = legislators)
  
  ## Load member-level attendance data
  a.df <- read.csv(".//scaling//member_level_attendance.csv")
  
  ## Do a bootstrapped, text-level analysis of how informational and confrontational
  #  each text is as a function of its speaker's characteristics
  chacteristicsboot <- function(alpha.infoconf.posterior, df){
    ## Take a posterior draw from the alphas
    S <- dim(alpha.infoconf.posterior)[1]
    alpha.infoconf <- alpha.infoconf.posterior[sample(1:S,1),,]
    ## Concatenate with the legislator data
    boot.df <- data.frame("info" = alpha.infoconf[,1], "conf" = alpha.infoconf[,2])
    ## Resample the texts
    boot.df <- cbind(boot.df, df)
    boot.df <- boot.df[sample(1:nrow(boot.df), nrow(boot.df), replace = TRUE),]
    ## Fit models
    boot.info.coef <- coef(lm(info ~ senate + comm_leader + seniority + safe + 
                                absnom + pres_copar + minority + male + lawyer + 
                                businessman + politician, data = boot.df))
    boot.conf.coef <- coef(lm(conf ~ senate + comm_leader + seniority + safe + 
                                absnom + pres_copar + minority + male + lawyer + 
                                businessman + politician, data = boot.df))
    return(c(boot.info.coef, boot.conf.coef))
  }
  
  ## Do a boot-strapped, legislator-level analysis of how informational and 
  #  confrontational each legislator is as a function of whether they are 
  #  from the same party as the president
  prescoparboot <- function(alpha.infoconf.posterior, df){
    ## Take a posterior draw from the alphas
    S <- dim(alpha.infoconf.posterior)[1]
    alpha.infoconf <- alpha.infoconf.posterior[sample(1:S,1),,]
    boot.df <- data.frame("info" = alpha.infoconf[,1], "conf" = alpha.infoconf[,2])
    
    ## Resample at the legislator level
    L <- length(unique(df$legid))
    boot.leg.ind <- sample(1:L, L, replace = TRUE)
    leg.inds <- which(df$legid == legislators[boot.leg.ind[1]])
    boot.df <- data.frame("info" = alpha.infoconf[leg.inds,1],
                          "conf" = alpha.infoconf[leg.inds,2],
                          "legid" = 1,
                          "pres_copar" = df$pres_copar[leg.inds])
    for (i in 2:length(boot.leg.ind)){
      leg.inds <- which(df$legid == legislators[boot.leg.ind[i]])
      boot.df <- rbind(boot.df, 
                       data.frame("info" = alpha.infoconf[leg.inds,1], 
                                  "conf" = alpha.infoconf[leg.inds,2],
                                  "legid" = i,
                                  "pres_copar" = df$pres_copar[leg.inds]))
    }
    
    ## Fit models with fixed effects
    boot.info.coef <- coef(felm(info ~ pres_copar | legid, data = boot.df))
    boot.conf.coef <- coef(felm(conf ~ pres_copar | legid, data = boot.df))
    return(c(boot.info.coef, boot.conf.coef))
  }
  
  ## Construct a hearing-level dataset of the attendance rate among all eligible legislators
  rate_df <- a.df %>% group_by(hearingid) %>%  summarize(attendees = sum(attended), eligible = n()) %>% 
    mutate(attend_rate = attendees/eligible) %>% select(hearingid, attend_rate, attendees)
  h.df <- data.frame("hearingid" = unique(hearings), "subcommittee" = subcommittee,
                     "senate" = senate) %>%  left_join(rate_df, by = c("hearingid" = "hearingid"))
  
  ## Conduct a hearing-level analysis of attendance as a function of hearing characteristics
  attendanceboot <- function(alpha.infoconf.posterior, h.df, h_map){
    ## Take a posterior draw from the alphas
    S <- dim(alpha.infoconf.posterior)[1]
    alpha.infoconf <- alpha.infoconf.posterior[sample(1:S,1),,]
    
    ## Project into a hearing-level data set, coding the info and conf of the
    #  hearing as the average of all texts in the hearing that were coded
    alpha.infoconf.hearing <- h_map%*%alpha.infoconf
    boot.df <- data.frame("info" = alpha.infoconf.hearing[,1], 
                          "conf" = alpha.infoconf.hearing[,2])
    
    ## Resample at the hearing-level
    boot.df <- cbind(boot.df, h.df)
    boot.df <- boot.df[sample(1:nrow(boot.df), nrow(boot.df), replace = TRUE),]
    boot.attend.coef <- coef(lm(attendees ~ info + conf + subcommittee + 
                                  senate + I(senate*subcommittee), data = boot.df))
    return(boot.attend.coef)
  }
  
  ## Do the same analysis, but with a beta regression of the proportion of
  #  eligible legislators who attended
  attendancebetaboot <- function(alpha.infoconf.posterior, h.df, h_map){
    S <- dim(alpha.infoconf.posterior)[1]
    alpha.infoconf <- alpha.infoconf.posterior[sample(1:S,1),,]
    alpha.infoconf.hearing <- h_map%*%alpha.infoconf
    boot.df <- data.frame("info" = alpha.infoconf.hearing[,1], 
                          "conf" = alpha.infoconf.hearing[,2])
    
    boot.df <- cbind(boot.df, h.df)
    boot.df <- boot.df[sample(1:nrow(boot.df), nrow(boot.df), replace = TRUE),]
    boot.betaattend.coef <- coef(betareg(attend_rate ~ info + conf + subcommittee + 
                                  senate + I(senate*subcommittee), data = boot.df))
    return(boot.betaattend.coef)
  }
  
  ## Conduct a member-hearing-level analysis of who attends as a function of
  #  hearing- and legislator-level characteristics
  infodeterboot <- function(alpha.infoconf.posterior, a.df, h_map){
    ## Take a posterior draw from the alphas
    S <- dim(alpha.infoconf.posterior)[1]
    alpha.infoconf <- alpha.infoconf.posterior[sample(1:S,1),,]
    
    # Get a hearing-level informational a
    alpha.infoconf.hearing <- h_map%*%alpha.infoconf
    boot.df <- data.frame("info" = alpha.infoconf.hearing[,1], 
                          "conf" = alpha.infoconf.hearing[,2],
                          "hearingid" = unique(hearings)) %>%
      left_join(a.df, by = c("hearingid" = "hearingid"))
    
    boot.df <- boot.df[sample(1:nrow(boot.df), nrow(boot.df), replace = TRUE),]
    infodeterboot.coef <- coef(lm(attended ~ info + conf + SubcommHearing + Senate + I(Senate*SubcommHearing) + 
                                  I(CommLeader == 0 & SubcommLeader == 0) + I(info*I(CommLeader == 0 & SubcommLeader == 0)) + 
                                  I(conf*I(CommLeader == 0 & SubcommLeader == 0)) +
                                  Minority + I(info*Minority) + I(conf*Minority) +
                                    I(Seniority > 5) + I(info * (Seniority > 5)) + I(conf * (Seniority > 5)), 
                                  data = boot.df))
    return(infodeterboot.coef)
  }
  
  ## Do the same but restrict attention to subcommittee hearings with non-leader
  #  legislators who don't have to be there.
  infodetersubcommboot <- function(alpha.infoconf.posterior, a.df, h_map){
    S <- dim(alpha.infoconf.posterior)[1]
    alpha.infoconf <- alpha.infoconf.posterior[sample(1:S,1),,]
    alpha.infoconf.hearing <- h_map%*%alpha.infoconf
    
    ## Build a hearing-level data set
    boot.df <- data.frame("info" = alpha.infoconf.hearing[,1], 
                          "conf" = alpha.infoconf.hearing[,2],
                          "hearingid" = unique(hearings)) %>%
      left_join(a.df, by = c("hearingid" = "hearingid"))
    
    ## Merge hearing-level data set with legislato-rlevel attendance and resample
    #  at the hearing level
    boot.df <- boot.df[sample(1:nrow(boot.df), nrow(boot.df), replace = TRUE),]
    infodeterboot.coef <- coef(lm(attended ~ info + conf + Senate + 
                                    I(SubcommLeader == 0) + I(info*I(SubcommLeader == 0)) + 
                                    I(conf*I(SubcommLeader == 0)) +
                                    Minority + I(info*Minority) + I(conf*Minority)  +
                                    I(Seniority > 5) + I(info * (Seniority > 5)) + I(conf * (Seniority > 5)), 
                                  data = boot.df, subset = which(CommLeader == 0 & SubcommHearing == 0)))
    return(infodeterboot.coef)
  }

  
  # Run 5000 bootstrap iterations for each model
  nsims <- 5000
  boot.characteristics <- matrix(NA, nrow = nsims, ncol = 24)
  boot.attendance <- matrix(NA, nrow = nsims, ncol = 6)
  boot.attendancebeta <- matrix(NA, nrow = nsims, ncol = 7)
  boot.infodeter <- matrix(NA, nrow = nsims, ncol = 15)
  boot.infodetersubcomm <- matrix(NA, nrow = nsims, ncol = 13)
  boot.prescopar <- matrix(NA, nrow = nsims, ncol = 2)
  
  for (i in 1:nsims){
    set.seed(i)
    boot.characteristics[i,] <- chacteristicsboot(alpha_posterior, df)
    
    set.seed(i)
    boot.prescopar[i,] <- prescoparboot(alpha_posterior, df)
    
    set.seed(i)
    boot.attendance[i,] <- attendanceboot(alpha_posterior, h.df, h_map)
    
    set.seed(i)
    boot.attendancebeta[i,] <- attendancebetaboot(alpha_posterior, h.df, h_map)
    
    set.seed(i)
    boot.infodeter[i,] <- infodeterboot(alpha_posterior, a.df, h_map)
    
    set.seed(i)
    boot.infodetersubcomm[i,] <- infodetersubcommboot(alpha_posterior, a.df, h_map)
  }
  
  ## Add column names to data
  colnames(boot.characteristics) <- c("InterceptInfo", "SenateInfo", "CommLeaderInfo",
                                      "SeniorityInfo", "SafeInfo", "AbsnomInfo", "PresCoparInfo",
                                      "MinorityInfo", "MaleInfo", "LawyerInfo",
                                      "BusinessmanInfo", "PoliticianInfo",
                                      "InterceptConf", "SenateConf", "CommLeaderConf",
                                      "SeniorityConf", "SafeConf", "AbsnomConf", "PresCoparConf",
                                      "MinorityConf", "MaleConf", "LawyerConf",
                                      "BusinessmanConf", "PoliticianConf")
  colnames(boot.prescopar) <- c("Info", "Conf")
  colnames(boot.attendance) <- c("Intercept", "Info", "Conf", "Subcommittee",
                                 "Senate", "Subcommittee*Senate")  
  colnames(boot.attendancebeta) <- c("Intercept", "Info", "Conf", "Subcommittee",
                                     "Senate", "Subcommittee*Senate", "Phi")
  colnames(boot.infodeter) <- c("Intercept", "Info", "Conf", "Subcommittee", "Senate", "Senate*Subcommittee",
                                "RankAndFile", "Info*RankAndFile", "Conf*RankAndFile",
                                "Minority", "Info*Minority", "Conf*Minority",
                                "Senior", "Info*Senior", "Conf*Senior")   
  colnames(boot.infodetersubcomm) <- c("Intercept", "Info", "Conf", "Senate", 
                                       "RankAndFile", "Info*RankAndFile", "Conf*RankAndFile",
                                       "Minority", "Info*Minority", "Conf*Minority",
                                       "Senior", "Info*Senior", "Conf*Senior")  
  
  ## Extract summary statistics for tables
  out <- list()
  out[["characteristics"]] <- rbind(apply(boot.characteristics, 2, mean),
                                    apply(boot.characteristics, 2, function(x) quantile(x, .025)),
                                    apply(boot.characteristics, 2, function(x) quantile(x, .975)),
                                    apply(boot.characteristics, 2, function(x) mean(x > 0)),
                                    apply(boot.characteristics, 2, function(x) mean(x < 0)))
  
  out[["prescopar"]] <- rbind(apply(boot.prescopar, 2, mean),
                              apply(boot.prescopar, 2, function(x) quantile(x, .025)),
                              apply(boot.prescopar, 2, function(x) quantile(x, .975)),
                              apply(boot.prescopar, 2, function(x) mean(x > 0)),
                              apply(boot.prescopar, 2, function(x) mean(x < 0)))
  
  out[["attendance"]] <- rbind(apply(boot.attendance, 2, mean),
                               apply(boot.attendance, 2, function(x) quantile(x, .025)),
                               apply(boot.attendance, 2, function(x) quantile(x, .975)),
                               apply(boot.attendance, 2, function(x) mean(x > 0)),
                               apply(boot.attendance, 2, function(x) mean(x < 0)))
  
  out[["attendancebeta"]] <- rbind(apply(boot.attendancebeta, 2, mean),
                               apply(boot.attendancebeta, 2, function(x) quantile(x, .025)),
                               apply(boot.attendancebeta, 2, function(x) quantile(x, .975)),
                               apply(boot.attendancebeta, 2, function(x) mean(x > 0)),
                               apply(boot.attendancebeta, 2, function(x) mean(x < 0)))
  
  
  out[["infodeter"]] <- rbind(apply(boot.infodeter, 2, mean),
                               apply(boot.infodeter, 2, function(x) quantile(x, .025)),
                               apply(boot.infodeter, 2, function(x) quantile(x, .975)),
                               apply(boot.infodeter, 2, function(x) mean(x > 0)),
                               apply(boot.infodeter, 2, function(x) mean(x < 0)))

  out[["infodetersubcomm"]] <- rbind(apply(boot.infodetersubcomm, 2, mean),
                              apply(boot.infodetersubcomm, 2, function(x) quantile(x, .025)),
                              apply(boot.infodetersubcomm, 2, function(x) quantile(x, .975)),
                              apply(boot.infodetersubcomm, 2, function(x) mean(x > 0)),
                              apply(boot.infodetersubcomm, 2, function(x) mean(x < 0)))
    
  return(out)
}

## Load legislator covariates for all texts coded by experts and RAs, respectively
expert_background_df <- read.csv(".//scaling//expert_comp.csv")
ra_background_df <- read.csv(".//scaling//ra_comp.csv")

experts_results <- getResults(data[which(data$expert == 1),], expert_background_df, experts_out[["alpha.infoconf.posterior"]])
ras_results <- getResults(data[which(data$ra == 1),], ra_background_df, ras_out[["alpha.infoconf.posterior"]])

## Table 2
experts_results[["prescopar"]]

## Table 3
experts_results[["characteristics"]]

## Table 4
experts_results[["attendance"]]

## Appendix F, Table 2
ras_results[["prescopar"]]

## Appendix F, Table 3
ras_results[["characteristics"]]

## Appendix F, Table 4
ras_results[["attendance"]]

## Appendix H, Table 5
experts_results[["attendancebeta"]]

## Appendix H, Table 6 (Columns 1 and 2, respectively)
experts_results[["infodeter"]]
experts_results[["infodetersubcomm"]]

## Load auditing data
audit_data <- read.csv(".//scaling//audit_comparisons.csv")
audit_out <- get_betas(audit_data, stan.model)
audit_out[["convergence"]]
audit_out[["infoconf_corr_results"]]

## Conduct audit analysis for Appendix G
audit.posterior.means <- data.frame(apply(audit_out[["alpha.infoconf.posterior"]], c(2,3), mean))
colnames(audit.posterior.means) <- c("Informational", "Confrontational")
experts.posterior.means$Id <- unique(c(data$id1[which(data$expert == 1)], data$id2[which(data$expert == 1)]))
audit.posterior.means$Id <- unique(c(audit_data$id1, audit_data$id2))
audit.results <- inner_join(experts.posterior.means, audit.posterior.means, by = c("Id" = "Id"),
                            suffix = c("Experts", "Audit"))
with(audit.results, cor(InformationalExperts, InformationalAudit))
with(audit.results, cor(ConfrontationalExperts, ConfrontationalAudit))



